In this session, you make the circuit simulation run on GPUs. We are going to see that mapping decisions do not affect the correctness of the application (modulo mapping failures caused by incorrect mapping decisions).
The task of this exercise is to write mapping rules that map the three simulation tasks to GPUs. To write mapping rules, you use a new mapper language called Bishop. In the syntax guide below, you can find how to select a set of tasks (or regions) and set their target to a particular processor (or memory). We give you a mapping rule for task update_voltages that maps the task to CPUs (more precisely, processors that support the x86 ISA). Based on this example and the syntax guide, write mapping rules that map tasks calculate_new_currents, distribute_charge, and update_voltages to GPUs (i.e., processors that support the CUDA ISA). The mapper stub below also contains a TODO for mapping the regions of these tasks to a zero-copy memory. As a bonus problem, map task distribute_charge to even-numbered GPUs and task update_voltages to odd-numbered GPUs (you can assume that there is an even number of GPUs). With all your mapping rules written correctly, you should see the solution pass validation.
In [ ]:
__demand(__cuda) task T ... -- Generates both x86 and CUDA variants for task T
mapper ... end -- Starts a Bishop mapper block
TE { target : V; } -- Sets value V as the target of a task that matches TE
TE RE { target : V; } -- Sets value V as the target of a region that matches RE and whose task matches TE
-- Task Element (TE)
task -- Selects any task
task#T -- Selects tasks named T
task[isa=I] -- Selects tasks mapped to a processor that supports ISA I
TE[target=$T] -- Selects tasks that satisfy TE and then binds their target to $T
TE[index=$P] -- Selects tasks that satisfy TE and then binds their point in the launch domain to $P
-- Region Element (RE)
region -- Selects any region
region#P -- Selects regions named P in the signature
-- Processor objects
processors -- A list of processors in the whole system
processors[isa=I] -- A list of processors that support ISA I (either x86 or cuda)
processors[N] -- The N-th processor in the list
L.size -- The size of list L of processors
P.memories -- A list of memories visible to processor P
-- Memory objects
memories -- A list of memories in the whole system
memories[kind=K] -- A list of memories of kind K (sysmem, regmem, fbmem, or zcmem)
memories[N] -- The N-th memory in the list
L.size -- The size of list L of memories
-- Expressions for list indices
$P[0] -- The first coordinate of point $P
E1 + E2, E1 - E2, E1 * E2, E1 / E2, E1 % E2 -- Usual integer arithmetic expressions
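To see how these forms compose, here is a minimal illustrative sketch (not the exercise solution). The task name foo is a placeholder, and chaining list operations such as indexing processors[isa=cuda] directly is an assumption based on the guide above rather than verified Bishop code; the rules would sit inside a mapper ... end block like the one in the program below.
-- Illustrative sketch only; 'foo' is a hypothetical task name.
task#foo[index=$p]
{
  -- Round-robin the point tasks of 'foo' over the GPUs
  -- (assumes chained list indexing is allowed).
  target : processors[isa=cuda][$p[0] % processors[isa=cuda].size];
}
task#foo[target=$proc] region
{
  -- Put every region argument of 'foo' in a zero-copy memory
  -- visible to the processor chosen above.
  target : $proc.memories[kind=zcmem];
}
For the bonus problem, the bound point $p and the arithmetic operators from the guide can be combined in the same way to distinguish even and odd GPU indices.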
In [ ]:
import "regent"
import "bishop"
local c = regentlib.c
struct Currents {
_0 : float,
_1 : float,
_2 : float,
}
struct Voltages {
_1 : float,
_2 : float,
}
fspace Node {
capacitance : float,
leakage : float,
charge : float,
voltage : float,
}
fspace Wire(rpn : region(Node), rsn : region(Node), rgn : region(Node)) {
in_node : ptr(Node, rpn, rsn),
out_node : ptr(Node, rpn, rsn, rgn),
inductance : float,
resistance : float,
capacitance : float,
current : Currents,
voltage : Voltages,
}
local CktConfig = require("session1/circuit_config")
local helper = require("session2/circuit_helper")
local validator = require("session2/circuit_validator")
local WS = 3
local dT = 1e-7
mapper
task#update_voltages
{
target : processors[isa=x86];
}
-- TODO: Write mapping rules that map the three simulation tasks to GPUs.
-- You might also want to try solving the bonus problem above.
task
{
target : ;
}
-- TODO: Write mapping rules that map regions of the three simulation tasks to a zero-copy memory.
task region
{
target : ;
}
end
__demand(__cuda)
task calculate_new_currents(steps : uint,
rpn : region(Node),
rsn : region(Node),
rgn : region(Node),
rw : region(Wire(rpn, rsn, rgn)))
where
reads(rpn.voltage, rsn.voltage, rgn.voltage,
rw.{in_node, out_node, inductance, resistance, capacitance}),
reads writes(rw.{current, voltage})
do
var rdT : float = 1.0 / dT
__demand(__vectorize)
for w in rw do
var temp_v : float[WS + 1]
var temp_i : float[WS]
var old_i : float[WS]
var old_v : float[WS - 1]
temp_i[0] = w.current._0
temp_i[1] = w.current._1
temp_i[2] = w.current._2
for i = 0, WS do old_i[i] = temp_i[i] end
temp_v[1] = w.voltage._1
temp_v[2] = w.voltage._2
for i = 0, WS - 1 do old_v[i] = temp_v[i + 1] end
-- Pin the outer voltages to the node voltages.
temp_v[0] = w.in_node.voltage
temp_v[WS] = w.out_node.voltage
-- Solve the RLC model iteratively.
var L : float = w.inductance
var rR : float = 1.0 / w.resistance
var rC : float = 1.0 / w.capacitance
for j = 0, steps do
-- First, figure out the new current from the voltage differential
-- and our inductance:
-- dV = R*I + L*I' ==> I = (dV - L*I')/R
for i = 0, WS do
temp_i[i] = ((temp_v[i + 1] - temp_v[i]) -
(L * (temp_i[i] - old_i[i]) * rdT)) * rR
end
-- Now update the inter-node voltages.
for i = 0, WS - 1 do
temp_v[i + 1] = old_v[i] + dT * (temp_i[i] - temp_i[i + 1]) * rC
end
end
-- Write out the results.
w.current._0 = temp_i[0]
w.current._1 = temp_i[1]
w.current._2 = temp_i[2]
w.voltage._1 = temp_v[1]
w.voltage._2 = temp_v[2]
end
end
__demand(__cuda)
task distribute_charge(rpn : region(Node),
rsn : region(Node),
rgn : region(Node),
rw : region(Wire(rpn, rsn, rgn)))
where
reads(rw.{in_node, out_node, current._0, current._2}),
reduces +(rpn.charge, rsn.charge, rgn.charge)
do
for w in rw do
var in_current = -dT * w.current._0
var out_current = dT * w.current._2
w.in_node.charge += in_current
w.out_node.charge += out_current
end
end
__demand(__cuda)
task update_voltages(rn : region(Node))
where
reads(rn.{capacitance, leakage}),
reads writes(rn.{voltage, charge})
do
for n in rn do
var voltage = n.voltage + n.charge / n.capacitance
voltage = voltage * (1.0 - n.leakage)
n.voltage = voltage
n.charge = 0.0
end
end
task toplevel()
var conf : CktConfig
conf:initialize_from_command()
conf:show()
var num_circuit_nodes = conf.num_pieces * conf.nodes_per_piece
var num_circuit_wires = conf.num_pieces * conf.wires_per_piece
var rn = region(ispace(ptr, num_circuit_nodes), Node)
var rw = region(ispace(ptr, num_circuit_wires), Wire(wild, wild, wild))
new(ptr(Node, rn), num_circuit_nodes)
new(ptr(Wire(wild, wild, wild), rw), num_circuit_wires)
c.printf("Generating a random circuit...\n")
helper.generate_random_circuit(rn, rw, conf)
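-- Partition the circuit into pieces:
--   pw         : wires grouped by the piece that owns their in_node
--   pn_private : nodes referenced only by wires of their own piece
--   pn_shared  : nodes of a piece that are also referenced (via out_node) by wires of other pieces
--   pn_ghost   : nodes of other pieces referenced by a piece's wires via out_node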
var colors = ispace(int1d, conf.num_pieces)
var pn_equal = partition(equal, rn, colors)
var pw = preimage(rw, pn_equal, rw.in_node)
var pn_extrefs = image(rn, preimage(rw, pn_equal, rw.out_node) - pw, rw.out_node)
var pn_private = pn_equal - pn_extrefs
var pn_shared = pn_equal & pn_extrefs
var pn_ghost = image(rn, pw, rw.out_node) - pn_equal
__demand(__parallel)
for i = 0, conf.num_pieces do
helper.initialize_pointers(pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
end
helper.wait_for(helper.block(rn, rw))
--helper.dump_graph(conf, rn, rw)
c.printf("Starting main simulation loop\n")
var ts_start = helper.timestamp()
for j = 0, conf.num_loops do
for i = 0, conf.num_pieces do
calculate_new_currents(conf.steps, pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
end
for i = 0, conf.num_pieces do
distribute_charge(pn_private[i], pn_shared[i], pn_ghost[i], pw[i])
end
for i = 0, conf.num_pieces do
update_voltages(pn_equal[i])
end
end
-- Wait for all previous tasks to complete and measure the elapsed time.
var _ = 0
for i = 0, conf.num_pieces do
_ += helper.block(pn_equal[i], pw[i])
end
helper.wait_for(_)
var ts_end = helper.timestamp()
c.printf("simulation complete\n")
var sim_time = 1e-6 * (ts_end - ts_start)
c.printf("ELAPSED TIME = %7.3f s\n", sim_time)
var gflops =
helper.calculate_gflops(sim_time, WS * 6 + (WS - 1) * 4, 4, 4, conf)
c.printf("GFLOPS = %7.3f GFLOPS\n", gflops)
c.printf("Validating simulation results...\n")
validator.validate_solution(rn, rw, conf)
end
bishoplib.register_bishop_mappers()
regentlib.start(toplevel)
Next up: try GPU mapping on a larger problem size.